Lots of data exploration inspiration from Michael Griffiths: https://www.kaggle.com/msjgriffiths/exploratory-analysis/code

# Load data: the anime and genre tables from data/clean and the user ratings
# from data/raw. Paths are relative to the current working directory.
# read.csv() treats the first line as the header by default.
animes <- read.csv("../data/clean/animes.csv", stringsAsFactors = FALSE)
genres <- read.csv("../data/clean/genres.csv", stringsAsFactors = FALSE)
ratings <- read.csv("../data/raw/no_null_ratings.csv")

Let’s look at the top anime by average rating.

# Bar chart of the N highest-rated anime.
N <- 10
df <- data.frame(
  Anime = animes$name,
  Rating = animes$rating,
  stringsAsFactors = FALSE
)

# Sort by rating, best first; order() puts missing ratings at the end.
df <- df[order(df$Rating, decreasing = TRUE), ]

# Keep the top N. head() returns fewer rows when the table is shorter than N,
# whereas indexing with 1:N would pad the result with all-NA rows.
df <- head(df, N)

# Turn the names into a factor whose levels follow the sorted order so the
# plot keeps the ranking instead of sorting names alphabetically. This is done
# after trimming so only the plotted names become levels; unique() guards
# against a duplicated name, which factor() rejects as a duplicated level.
df$Anime <- factor(df$Anime, levels = unique(df$Anime))

df %>% ggplot(aes(x = Anime, y = Rating)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  # Zoom the y axis to 9-10 so differences between top ratings are visible.
  coord_cartesian(ylim = c(9.0, 10.0)) +
  labs(title = "Top Anime Ratings",
       caption = "source: MAL dataset") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

Let’s explore the distribution of ratings among some different groupings

First, does type (Movie, TV show,…) matter?

# Overlaid rating densities, one semi-transparent curve per anime type,
# on a fixed 0-10 rating axis.
ggplot(animes, aes(x = rating, group = type)) +
  geom_density(aes(fill = type), alpha = 0.4) +
  xlim(0, 10) +
  labs(title = "Average Anime Ratings by Type")
## Warning: Removed 230 rows containing non-finite values (stat_density).

Looks like it does! Let’s look more closely at the variation between types

# Per-type summary of audience size (members) and rating, rendered as a
# formattable table. The rating statistics skip missing ratings; na.rm is
# spelled TRUE rather than T because T is an ordinary, reassignable variable.
animes %>%
  group_by(type) %>%
  summarise(
    average.viewers = mean(members),
    sd.viewers = sd(members),
    average.rating = mean(rating, na.rm = TRUE),
    sd.rating = sd(rating, na.rm = TRUE)
  ) %>%
  formattable()
type average.viewers sd.viewers average.rating sd.rating
6537.400 13278.495 NaN NA
Movie 10369.094 30898.076 6.318414 1.2119725
Music 1311.840 4548.136 5.588996 0.9584401
ONA 4114.030 12399.959 5.643298 1.1270907
OVA 5986.140 15026.128 6.375221 0.8583584
Special 7676.061 15546.290 6.523501 0.8877620
TV 42683.658 89121.009 6.902299 0.8635256

Explore the genres too!

# Attach each anime's rating to its rows in the genre table (joined on
# anime_id), then draw one rating density per genre as an interactive chart.
animes_subset_ratings <- select(animes, anime_id, rating)
genres_with_ratings <- genres %>%
  inner_join(animes_subset_ratings, by = "anime_id")

g <- ggplot(genres_with_ratings, aes(x = rating, group = genre)) +
  geom_density(aes(fill = genre), alpha = 0.4)

ggplotly(g)
## Warning: Removed 690 rows containing non-finite values (stat_density).

No one likes the Dementia genre. Harem and Yuri have very pronounced peaks.

In our dataset, what’s the distribution of user ratings?

# Interactive bar chart: how many ratings were given at each score.
g_bar <- ggplot(ratings, aes(x = factor(rating))) +
  geom_bar()
ggplotly(g_bar)

# Violin plot of all ratings with a narrow boxplot drawn on top. A violin
# needs an x grouping, so a constant dummy factor stands in for the one group.
g_violin <- ggplot(ratings, aes(x = factor(0), y = rating)) +
  geom_violin(trim = FALSE, adjust = 2) +
  geom_boxplot(width = 0.1) +
  xlab("")
ggplotly(g_violin)
# Density of each user's mean rating (one value per user_id), shown as an
# interactive chart.
g <- ratings %>%
  group_by(user_id) %>%
  summarise(m = mean(rating)) %>%
  ggplot(aes(x = m)) +
  labs(title = "Distribution of average rating over users") +
  geom_density()

ggplotly(g)
# Quartiles, mean and range of every individual rating.
summary(ratings[["rating"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   7.000   8.000   7.808   9.000  10.000
# Standard deviation over all ratings; printed again further down as the
# baseline for the School Days comparison.
ratings.sd <- ratings[["rating"]] %>% sd()
print(ratings.sd)
## [1] 1.572496

School Days is a notoriously controversial anime. Is the variance of users’ ratings of this anime higher than it is for most anime?

# Look up the id of the anime named exactly "School Days", collect its user
# ratings, and print their standard deviation next to the overall one.
school_days.id <- (animes %>% filter(name == "School Days"))$anime_id
school_days.ratings <- ratings %>% filter(anime_id == school_days.id)
school_days.sd <- school_days.ratings$rating %>% sd()
print(ratings.sd)
## [1] 1.572496
print(school_days.sd)
## [1] 2.354353
# Shared base plot over the School Days ratings; each chart adds its own layer.
g <- ggplot(school_days.ratings)

# Bar chart: number of ratings at each score.
g + geom_bar(aes(x = factor(rating)))

# Violin plot of the same ratings. A constant dummy factor supplies the single
# x group; its tick and axis label are suppressed.
g +
  geom_violin(aes(x = factor(0), y = rating), trim = FALSE, adjust = 2) +
  scale_x_discrete(breaks = NULL) +
  xlab("")

# Yeah, looks like the spread is wider for School Days (sd 2.35 vs 1.57 overall).

# Let’s compute class rankings to find the true Weebs

# Weeb score inputs
# Score bounds: a favorite is assumed to deserve MAX_SCORE, a least favorite
# MIN_SCORE.
MAX_SCORE <- 10
MIN_SCORE <- 1

# In-class survey answers. The two vectors are parallel: fav_animes[i] is the
# favorite anime of students[i]. A blank string is a missing answer.
students <- c(
  "Adriana",
  "Beau",
  "David",
  "Fanny",
  "Joe",
  "Kevin",
  "Lilly (Ralf)",
  "Lydia",
  "Mac",
  "Michael",
  "Noah",
  "Richard",
  "Roger",
  "Saad",
  "Shane",
  "Stephanie",
  "Ty",
  "Xiaotai"
)
fav_animes <- c(
  "Psycho-Pass",
  "One Punch Man",
  "Cowboy Bebop",
  "",
  "FLCL",
  "Death Note",
  "Last Exile",
  "JoJo no Kimyou na Bouken (TV)",
  "Pokemon",
  "Tonari no Totoro",
  "Ginga Eiyuu Densetsu",
  "Afro Samurai",
  "Yuri!!! on Ice",
  "Dragon Ball Z",
  "JoJo no Kimyou na Bouken: Diamond wa Kudakenai",
  "Ouran Koukou Host Club",
  "Mushishi",
  "Doraemon (1979)"
)

# Culture score for one favorite anime: the squared gap between MAX_SCORE and
# the anime's rating in the global `animes` table (smaller is better).
#
# fav_anime: a single anime name, compared exactly against animes$name.
#
# Returns a single number.
#   * A name with no match (for example a blank answer) is scored as if the
#     anime were rated MIN_SCORE.
#   * If several rows share the name, the first one is used, so the result
#     always has length one and sapply()/vapply() callers get a plain numeric
#     vector instead of a list.
#   * A matched anime whose rating is NA yields NA.
calculate_culture_score <- function(fav_anime) {
  # which() discards NA comparisons, so rows with a missing name never match.
  matched_ratings <- animes$rating[which(animes$name == fav_anime)]
  if (length(matched_ratings) == 0L) { # no result
    rating <- MIN_SCORE
  } else {
    rating <- matched_ratings[[1L]]
  }
  culture.score <- (MAX_SCORE - rating)^2
  # if (culture.score < 1) {
  #   print("Ah, I see you're a man of culture as well.")
  # }
  culture.score
}

# Score every student's favorite anime. vapply() requires exactly one number
# per anime and fails loudly otherwise, where sapply() would silently return
# a list. USE.NAMES = FALSE keeps the result unnamed.
weeb.scores <- vapply(
  fav_animes,
  calculate_culture_score,
  numeric(1),
  USE.NAMES = FALSE
)

df <- data.frame(Student = students, Score = weeb.scores, stringsAsFactors = FALSE)

# Sort by score, lowest (best) first; order() puts NA scores at the end.
df <- df[order(df$Score), ]
# Students become a factor whose levels follow the sorted order, so the plot
# keeps the ranking instead of sorting names alphabetically.
df$Student <- factor(df$Student, levels = df$Student)

ggplot(df, aes(x = Student, y = Score)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(title = "Class Rankings",
       subtitle = "Culture Score",
       caption = "source: In-Class Survey") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

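The scale is thrown off by one missing answer: the blank favorite anime matches nothing, so it gets the MIN_SCORE fallback and a score of 81. Let’s try again with that outlier removed.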

# Keep only scores below 60 (filter() also drops rows whose Score is NA),
# then redraw the ranking with a more descriptive subtitle.
df <- df %>% filter(Score < 60)
df %>%
  ggplot(aes(x = Student, y = Score)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Class Rankings",
    subtitle = "Squared Error of Favorite Anime Rating",
    caption = "source: In-Class Survey"
  )